Required packages

pkgs <- c("sf", "gstat", "mapview", "dplyr",
          "rvest", "xml2",
          "nomisr", "osmdata", "tidyr")
# Attach all required packages. require() returns FALSE (with a warning)
# instead of erroring when a package is missing, so check the results
# explicitly and surface any failures in one clear message.
loaded <- vapply(pkgs, require, logical(1), character.only = TRUE)
if (!all(loaded)) {
  warning("Could not load: ", paste(pkgs[!loaded], collapse = ", "),
          call. = FALSE)
}

Session info

# Record the R version, platform, and package versions for reproducibility
sessionInfo()
## R version 4.2.2 (2022-10-31 ucrt)
## Platform: x86_64-w64-mingw32/x64 (64-bit)
## Running under: Windows 10 x64 (build 19044)
## 
## Matrix products: default
## 
## locale:
## [1] LC_COLLATE=English_United Kingdom.utf8 
## [2] LC_CTYPE=English_United Kingdom.utf8   
## [3] LC_MONETARY=English_United Kingdom.utf8
## [4] LC_NUMERIC=C                           
## [5] LC_TIME=English_United Kingdom.utf8    
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
## [1] tidyr_1.3.0    osmdata_0.2.0  nomisr_0.4.7   xml2_1.3.3     rvest_1.0.3   
## [6] dplyr_1.1.0    mapview_2.11.0 gstat_2.0-9    sf_1.0-8      
## 
## loaded via a namespace (and not attached):
##  [1] Rcpp_1.0.10        lattice_0.20-45    FNN_1.1.3.1        png_0.1-7         
##  [5] class_7.3-20       zoo_1.8-11         digest_0.6.29      utf8_1.2.2        
##  [9] plyr_1.8.7         R6_2.5.1           rsdmx_0.6          stats4_4.2.2      
## [13] evaluate_0.20      e1071_1.7-11       httr_1.4.4         pillar_1.8.1      
## [17] rlang_1.1.0        rstudioapi_0.14    raster_3.6-3       jquerylib_0.1.4   
## [21] rmarkdown_2.20     webshot_0.5.4      htmlwidgets_1.5.4  munsell_0.5.0     
## [25] proxy_0.4-27       compiler_4.2.2     xfun_0.37          pkgconfig_2.0.3   
## [29] base64enc_0.1-3    htmltools_0.5.4    tidyselect_1.2.0   tibble_3.1.8      
## [33] intervals_0.15.2   codetools_0.2-18   XML_3.99-0.11      fansi_1.0.3       
## [37] spacetime_1.2-8    grid_4.2.2         jsonlite_1.8.4     satellite_1.0.4   
## [41] lifecycle_1.0.3    DBI_1.1.3          magrittr_2.0.3     units_0.8-0       
## [45] scales_1.2.1       KernSmooth_2.23-20 cli_3.4.1          cachem_1.0.7      
## [49] leaflet_2.1.1      sp_1.5-0           snakecase_0.11.0   bslib_0.4.2       
## [53] xts_0.12.1         generics_0.1.3     vctrs_0.6.0        tools_4.2.2       
## [57] leafem_0.2.0       glue_1.6.2         purrr_1.0.1        crosstalk_1.2.0   
## [61] fastmap_1.1.0      yaml_2.3.5         colorspace_2.0-3   terra_1.6-17      
## [65] classInt_0.4-8     knitr_1.42         sass_0.4.5
# Create subdir for data if not exists.
# A plain `if` is preferred over ifelse() here: ifelse() is for vectors,
# and using it on a scalar condition for its side effect is an anti-pattern.
dn <- "_data"
if (!dir.exists(dn)) {
  dir.create(dn)
}

Web Scraping

This example is inspired by Biegert, Kühhirt, and Van Lancker (2023).

Football Stats

Let’s try to get some football player stats from FBREF. On the home page, we see a list of trending player pages: https://fbref.com/en/ Let’s see if we can get this information into R.

Inspecting the head shots of the players gives us the following XPath: //*[@id="players"]/div[3]

# Scrape the website
# read_html() downloads and parses the page into an xml_document
url <- "https://fbref.com/en/"
parsed <- read_html(url)

# Save xml (so we have the website locally)
# Caching the page avoids hitting the server repeatedly while developing
write_xml(parsed, file = "_data/fbplayers.xml")
# Read back in
parsed <- read_html("_data/fbplayers.xml")

# Select the desired element
# XPath was obtained by inspecting the "trending players" box in the browser
parsed.sub <- html_elements(parsed, xpath =  '//*[@id="players"]/div[3]')
# Each player is an <a> node: link text = name, href = relative profile URL
trending.players <- html_elements(parsed.sub, "a")

trending.players
## {xml_nodeset (20)}
##  [1] <a href="/en/players/16380240/Mesut-Ozil">Mesut Özil</a>
##  [2] <a href="/en/players/d70ce98e/Lionel-Messi">Lionel Messi</a>
##  [3] <a href="/en/players/dea698d9/Cristiano-Ronaldo">Cristiano Ronaldo</a>
##  [4] <a href="/en/players/1f44ac21/Erling-Haaland">Erling Haaland</a>
##  [5] <a href="/en/players/6f806d99/Enzo-Le-Fee">Enzo Le Fée</a>
##  [6] <a href="/en/players/ad82197c/Axel-Disasi">Axel Disasi</a>
##  [7] <a href="/en/players/bc7dc64d/Bukayo-Saka">Bukayo Saka</a>
##  [8] <a href="/en/players/35a6e5c7/Gabriel-Veiga">Gabriel Veiga</a>
##  [9] <a href="/en/players/57d88cf9/Jude-Bellingham">Jude Bellingham</a>
## [10] <a href="/en/players/21a66f6a/Harry-Kane">Harry Kane</a>
## [11] <a href="/en/players/e46012d4/Kevin-De-Bruyne">Kevin De Bruyne</a>
## [12] <a href="/en/players/8c90fd7a/Victor-Osimhen">Victor Osimhen</a>
## [13] <a href="/en/players/69384e5d/Neymar">Neymar</a>
## [14] <a href="/en/players/dea88efd/Khvicha-Kvaratskhelia">Khvicha Kvaratskhel ...
## [15] <a href="/en/players/1c7012b8/Declan-Rice">Declan Rice</a>
## [16] <a href="/en/players/1bacc518/Frenkie-de-Jong">Frenkie de Jong</a>
## [17] <a href="/en/players/507c7bdf/Bruno-Fernandes">Bruno Fernandes</a>
## [18] <a href="/en/players/19cae58d/Gavi">Gavi</a>
## [19] <a href="/en/players/31822f8c/Folarin-Balogun">Folarin Balogun</a>
## [20] <a href="/en/players/42fd9c7f/Kylian-Mbappe">Kylian Mbappé</a>

We have extracted the set of players with the respective links. Now, we separate these into the names with html_text2() and the links using html_attrs().

# Data frame with names and links of players
trending.players.df <- data.frame(names = html_text2(trending.players),
                       links = unlist(html_attrs(trending.players)))

# Extend the relative links into absolute URLs.
# The scraped hrefs already start with "/", so the base URL must NOT end
# in "/" — the original "https://fbref.com/" produced double slashes
# ("https://fbref.com//en/..."), as visible in the rendered output below.
trending.players.df$links <- paste0("https://fbref.com", trending.players.df$links)

head(trending.players.df)
##               names                                                    links
## 1        Mesut Özil        https://fbref.com//en/players/16380240/Mesut-Ozil
## 2      Lionel Messi      https://fbref.com//en/players/d70ce98e/Lionel-Messi
## 3 Cristiano Ronaldo https://fbref.com//en/players/dea698d9/Cristiano-Ronaldo
## 4    Erling Haaland    https://fbref.com//en/players/1f44ac21/Erling-Haaland
## 5       Enzo Le Fée       https://fbref.com//en/players/6f806d99/Enzo-Le-Fee
## 6       Axel Disasi       https://fbref.com//en/players/ad82197c/Axel-Disasi

Exercise

  • Can you get some player characteristics for Lionel Messi?

  • Can you wrap this into a loop for multiple players? USE Sys.sleep() TO PAUSE!

Now we have a time-series object of the most trending player’s seasonal performances.

# NOTE(review): player_stats.df is built in the exercise above (looping over
# the player pages); this call only works once that exercise has been solved.
head(player_stats.df)

API

This is from a short course Introduction to geospatial data and analysis

London shapefile (polygon)

Lets get some administrative boundaries for London from the London Datastore.

# Download zip file and unzip.
# Give the tempfile a .zip extension and request binary mode explicitly:
# without mode = "wb" a text-mode download can corrupt the zip on Windows
# (auto-detection relies on the URL/file ending, so be explicit).
tmpf <- tempfile(fileext = ".zip")
boundary.link <- "https://data.london.gov.uk/download/statistical-gis-boundary-files-london/9ba8c833-6370-4b11-abdc-314aa020d5e0/statistical-gis-boundaries-london.zip"
download.file(boundary.link, tmpf, mode = "wb")
unzip(zipfile = tmpf, exdir = dn)  # paste0(dn) was a no-op wrapper around dn
unlink(tmpf)

# This is a shapefile
# We only need the MSOA layer for now
msoa.spdf <- st_read(dsn = file.path(dn, "statistical-gis-boundaries-london", "ESRI"),
                     layer = "MSOA_2011_London_gen_MHW" # Note: no file ending
                     )
## Reading layer `MSOA_2011_London_gen_MHW' from data source 
##   `C:\work\Lehre\Web_Scraping\02_Exercise\_data\statistical-gis-boundaries-london\ESRI' 
##   using driver `ESRI Shapefile'
## Simple feature collection with 983 features and 12 fields
## Geometry type: MULTIPOLYGON
## Dimension:     XY
## Bounding box:  xmin: 503574.2 ymin: 155850.8 xmax: 561956.7 ymax: 200933.6
## Projected CRS: OSGB 1936 / British National Grid
# Peek at the attribute columns plus the geometry column
head(msoa.spdf)
## Simple feature collection with 6 features and 12 fields
## Geometry type: MULTIPOLYGON
## Dimension:     XY
## Bounding box:  xmin: 530966.7 ymin: 180510.7 xmax: 551943.8 ymax: 191139
## Projected CRS: OSGB 1936 / British National Grid
##    MSOA11CD                 MSOA11NM   LAD11CD              LAD11NM   RGN11CD
## 1 E02000001       City of London 001 E09000001       City of London E12000007
## 2 E02000002 Barking and Dagenham 001 E09000002 Barking and Dagenham E12000007
## 3 E02000003 Barking and Dagenham 002 E09000002 Barking and Dagenham E12000007
## 4 E02000004 Barking and Dagenham 003 E09000002 Barking and Dagenham E12000007
## 5 E02000005 Barking and Dagenham 004 E09000002 Barking and Dagenham E12000007
## 6 E02000007 Barking and Dagenham 006 E09000002 Barking and Dagenham E12000007
##   RGN11NM USUALRES HHOLDRES COMESTRES POPDEN HHOLDS AVHHOLDSZ
## 1  London     7375     7187       188   25.5   4385       1.6
## 2  London     6775     6724        51   31.3   2713       2.5
## 3  London    10045    10033        12   46.9   3834       2.6
## 4  London     6182     5937       245   24.8   2318       2.6
## 5  London     8562     8562         0   72.1   3183       2.7
## 6  London     8791     8672       119   50.6   3441       2.5
##                         geometry
## 1 MULTIPOLYGON (((531667.6 18...
## 2 MULTIPOLYGON (((548881.6 19...
## 3 MULTIPOLYGON (((549102.4 18...
## 4 MULTIPOLYGON (((551550 1873...
## 5 MULTIPOLYGON (((549099.6 18...
## 6 MULTIPOLYGON (((549819.9 18...

This looks like a conventional data.frame but has the additional column geometry containing the coordinates of each observation. st_geometry() returns only the geographic object and st_drop_geometry() only the data.frame without the coordinates. We can plot the object using mapview().

# Interactive map of population density (POPDEN) per MSOA
mapview(msoa.spdf[, "POPDEN"])

Census API (admin units)

Now that we have some boundaries and shapes of spatial units in London, we can start looking for different data sources to populate the geometries.

A good source for demographic data is, for instance, the 2011 census. Below we use the nomis API to retrieve population data for London; see the Vignette for more information (guest users are limited to 25,000 rows per query). Further below, a wrapper avoids some mess with sex and urban-rural cross-tabs.

### For larger request, register and set key
# Sys.setenv(NOMIS_API_KEY = "XXX")
# nomis_api_key(check_env = TRUE)



# Get London ids
# Vector of MSOA codes used below to restrict the API queries to London
london_ids <- msoa.spdf$MSOA11CD

### Get key statistics ids
# select required tables (https://www.nomisweb.co.uk/sources/census_2011_ks)
# Let's get KS201EW (ethnic group) and KS402EW (housing tenure)

From the Nomis website https://www.nomisweb.co.uk/sources/census_2011_ks, we can get the external identifiers that we want to pull: KS201EW (ethnic group) and KS402EW (housing tenure). Unfortunately, internal keys differ (for what reason ever…).

# Get table of IDs and description
# nomis_data_info() returns the full catalogue of Nomis datasets:
# one row per dataset, internal key in `id`, human-readable label in `name.value`
x <- nomis_data_info()
head(x)
## # A tibble: 6 × 14
##   agencyid id     uri    version annot…¹ compo…² compo…³ compo…⁴ compo…⁵ compo…⁶
##   <chr>    <chr>  <chr>    <dbl> <list>  <list>  <list>  <chr>   <chr>   <chr>  
## 1 NOMIS    NM_1_1 Nm-1d1       1 <df>    <df>    <df>    OBS_VA… CL_1_1… TIME   
## 2 NOMIS    NM_2_1 Nm-2d1       1 <df>    <df>    <df>    OBS_VA… CL_2_1… TIME   
## 3 NOMIS    NM_4_1 Nm-4d1       1 <df>    <df>    <df>    OBS_VA… CL_4_1… TIME   
## 4 NOMIS    NM_5_1 Nm-5d1       1 <df>    <df>    <df>    OBS_VA… CL_5_1… TIME   
## 5 NOMIS    NM_6_1 Nm-6d1       1 <df>    <df>    <df>    OBS_VA… CL_6_1… TIME   
## 6 NOMIS    NM_7_1 Nm-7d1       1 <df>    <df>    <df>    OBS_VA… CL_7_1… TIME   
## # … with 4 more variables: description.value <chr>, description.lang <chr>,
## #   name.value <chr>, name.lang <chr>, and abbreviated variable names
## #   ¹​annotations.annotation, ²​components.attribute, ³​components.dimension,
## #   ⁴​components.primarymeasure.conceptref, ⁵​components.timedimension.codelist,
## #   ⁶​components.timedimension.conceptref

So we have to find the internal ids of KS201EW and KS402EW.

# Get internal ids
# Match the external table codes against the catalogue labels;
# grep() returns the matching indices directly (same as which(grepl(...))).
stats <- c("KS201EW", "KS402EW")
oo <- grep(paste(stats, collapse = "|"), x$name.value)
ksids <- x$id[oo]
ksids # These are the internal ids
## [1] "NM_608_1" "NM_619_1"

Finally, we can start pulling the information from the Nomis census API! Below is a wrapper to avoid some mess with sex and urban-rural cross-tabs. Some characteristics can be pulled separately for urban-rural or by sex - In that case we have to specify it, but we cannot specify the option if a variable does not have urban-rural or sex differentials. The wrapper below also creates a codebook with variable descriptions.

### look at meta information
# nomis_overview() returns a key-value tibble describing the dataset
q <- nomis_overview(ksids[1])
head(q)
## # A tibble: 6 × 2
##   name           value           
##   <chr>          <list>          
## 1 analyses       <named list [1]>
## 2 analysisname   <chr [1]>       
## 3 analysisnumber <int [1]>       
## 4 contact        <named list [4]>
## 5 contenttypes   <named list [1]>
## 6 coverage       <chr [1]>
# List the geography levels available for this table
a <- nomis_get_metadata(id = ksids[1], concept = "GEOGRAPHY", type = "type")
a # TYPE297 is MSOA level
## # A tibble: 24 × 3
##    id      label.en                                           description.en    
##    <chr>   <chr>                                              <chr>             
##  1 TYPE265 NHS area teams                                     NHS area teams    
##  2 TYPE266 clinical commissioning groups                      clinical commissi…
##  3 TYPE267 built-up areas including subdivisions              built-up areas in…
##  4 TYPE269 built-up areas                                     built-up areas    
##  5 TYPE273 national assembly for wales electoral regions 2010 national assembly…
##  6 TYPE274 postcode areas                                     postcode areas    
##  7 TYPE275 postcode districts                                 postcode districts
##  8 TYPE276 postcode sectors                                   postcode sectors  
##  9 TYPE277 national assembly for wales constituencies 2010    national assembly…
## 10 TYPE279 parishes 2011                                      parishes 2011     
## # … with 14 more rows
# List the measures available at MSOA level (absolute counts vs percentages)
b <- nomis_get_metadata(id = ksids[1], concept = "MEASURES", type = "TYPE297")
b # 20100 is the measure of absolute numbers
## # A tibble: 2 × 3
##   id    label.en description.en
##   <chr> <chr>    <chr>         
## 1 20100 value    value         
## 2 20301 percent  percent
### Query data in loop over the required statistics
# For each internal table id: pull the 2011 MSOA-level counts for London,
# harmonise column names, fix ambiguous cell codes, save a codebook,
# reshape to wide (one row per MSOA), and merge everything into
# census_keystat.df / census_keystat_cb.df.
for(i in ksids){

  # Determine if data is divided by sex or urban-rural
  nd <- nomis_get_metadata(id = i)
  UR <- "RURAL_URBAN" %in% nd$conceptref
  SEX <- "C_SEX" %in% nd$conceptref

  # Make data request.
  # RURAL_URBAN / C_SEX (0 = total) may only be supplied when the table
  # actually has those dimensions, so build the argument list conditionally
  # instead of nesting four nearly identical calls.
  req <- list(id = i, time = "2011",
              geography = london_ids, # replace with "TYPE297" for all MSOAs
              measures = 20100)       # 20100 = absolute numbers
  if(UR){
    req$RURAL_URBAN <- 0
  }
  if(SEX){
    req$C_SEX <- 0
  }
  tmp_en <- do.call(nomis_get_data, req)

  # Append (in case of different regions)
  ks_tmp <- tmp_en

  # Make lower case names
  names(ks_tmp) <- tolower(names(ks_tmp))
  names(ks_tmp)[names(ks_tmp) == "geography_code"] <- "msoa11"
  names(ks_tmp)[names(ks_tmp) == "geography_name"] <- "name"

  # replace weird cell codes: purely numeric codes get the 7-char table
  # prefix of their siblings (or the internal id as fallback) so the wide
  # column names below stay unique and interpretable
  onlynum <- which(grepl("^[[:digit:]]+$", ks_tmp$cell_code))
  if(length(onlynum) != 0){
    code <- substr(ks_tmp$cell_code[-onlynum][1], 1, 7)
    if(is.na(code)){
      code <- i
    }
    ks_tmp$cell_code[onlynum] <- paste0(code, "_", ks_tmp$cell_code[onlynum])
  }

  # save codebook (one row per distinct cell / variable description)
  ks_cb <- unique(ks_tmp[, c("date", "cell_type", "cell", "cell_code", "cell_name")])

  ### Reshape: one row per MSOA, one column per cell_code
  ks_res <- tidyr::pivot_wider(ks_tmp, id_cols = c("msoa11", "name"),
                               names_from = "cell_code",
                               values_from = "obs_value")

  ### Merge results across statistics (full outer join on MSOA id + name)
  if(i == ksids[1]){
    census_keystat.df <- ks_res
    census_keystat_cb.df <- ks_cb
  }else{
    census_keystat.df <- merge(census_keystat.df, ks_res, by = c("msoa11", "name"), all = TRUE)
    census_keystat_cb.df <- rbind(census_keystat_cb.df, ks_cb)
  }

}


# Descriptions are saved in the codebook
head(census_keystat_cb.df)
## # A tibble: 6 × 5
##    date cell_type     cell cell_code   cell_name                                
##   <dbl> <chr>        <dbl> <chr>       <chr>                                    
## 1  2011 Ethnic Group     0 KS201EW0001 All usual residents                      
## 2  2011 Ethnic Group   100 KS201EW_100 White                                    
## 3  2011 Ethnic Group     1 KS201EW0002 White: English/Welsh/Scottish/Northern I…
## 4  2011 Ethnic Group     2 KS201EW0003 White: Irish                             
## 5  2011 Ethnic Group     3 KS201EW0004 White: Gypsy or Irish Traveller          
## 6  2011 Ethnic Group     4 KS201EW0005 White: Other White
### Merge with MSOA geometries above
# Left join (all.x = TRUE): keep every MSOA polygon even without census data
msoa.spdf <- merge(msoa.spdf, census_keystat.df, 
                   by.x = "MSOA11CD", by.y = "msoa11", all.x = TRUE)

Now we can, for instance, plot the spatial distribution of ethnic groups.

# Share of residents identifying as White, in percent
# (KS201EW_100 = White total, KS201EW0001 = all usual residents)
msoa.spdf[["per_white"]] <- with(msoa.spdf, 100 * KS201EW_100 / KS201EW0001)

mapview(msoa.spdf[, "per_white"])

References

Biegert, Thomas, Michael Kühhirt, and Wim Van Lancker. 2023. “They Can’t All Be Stars: The Matthew Effect, Cumulative Status Bias, and Status Persistence in NBA All-Star Elections.” American Sociological Review, March, 000312242311591. https://doi.org/10.1177/00031224231159139.